1 Set Working Directory

# Set the knitr root directory so all relative paths below resolve from the
# project data folder.
# NOTE(review): absolute, user-specific path — consider a relative path or
# here::here() so the document knits on other machines. TODO confirm.
knitr::opts_knit$set(root.dir = '/Users/kpitz/Projects/Gulf_of_California/Data_for_upload/')
# Confirm the working directory took effect
getwd()
## [1] "/Users/kpitz/Projects/Gulf_of_California/Data_for_upload"

2 Import packages

library(biomformat)
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
library(car)
## Loading required package: carData
library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
library(ggplot2)
library(ggdendro)
library(phyloseq)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-5
library(clustsig) #SIMPER/SIMPROF
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
library(scales)
library(grid)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(magrittr) # double pipes with dplyr
library(tibble)
library(cluster)

3 Read in Data, Rarefy, Export Rarefied Data

3.1 COI

# Build the un-rarefied COI phyloseq object from three CSVs:
# OTU counts (taxa as rows), taxonomy table, and the shared PCTD sample metadata.
GOC_COI <- merge_phyloseq(otu_table(as.matrix(read.csv(file = "USEARCH/Raw_Data/GOC_Geller_COI_otu_unrare_040319.csv", row.names = 1,check.names = FALSE)), taxa_are_rows = TRUE),tax_table(as.matrix(read.csv(file = "USEARCH/Raw_Data/GOC_Geller_COI_taxa_unrare_040319.csv", row.names = 1))),sample_data(read.csv(file = "Combined_PCTD_Metadata_043019.csv", row.names = 1)))

# Print a summary of the assembled object
GOC_COI
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1596 taxa and 20 samples ]
## sample_data() Sample Data:       [ 20 samples by 54 sample variables ]
## tax_table()   Taxonomy Table:    [ 1596 taxa by 7 taxonomic ranks ]

3.2 18S

# Build the un-rarefied 18S phyloseq object (same structure as the COI import above).
GOC_18S <- merge_phyloseq(otu_table(as.matrix(read.csv(file = "USEARCH/Raw_Data/GOC_Geller_18S_otu_unrare_040319.csv", row.names = 1,check.names = FALSE)), taxa_are_rows = TRUE),tax_table(as.matrix(read.csv(file = "USEARCH/Raw_Data/GOC_Geller_18S_taxa_unrare_040319.csv", row.names = 1))),sample_data(read.csv(file = "Combined_PCTD_Metadata_043019.csv", row.names = 1)))
# Print a summary of the assembled object
GOC_18S
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 342 taxa and 20 samples ]
## sample_data() Sample Data:       [ 20 samples by 54 sample variables ]
## tax_table()   Taxonomy Table:    [ 342 taxa by 7 taxonomic ranks ]

4 Begin Here to Recreate Figures using Rarefied Data

Import Rarefied data to edit figures

#Data previously Rarefied
#Data previously Rarefied
# Import the rarefied COI OTU/taxonomy tables plus the full PCTD metadata.
GOC_COI_r <- merge_phyloseq(otu_table(as.matrix(read.csv(file = "USEARCH/Rarefied_Data/GOC_Geller_COI_otu_rare_032119.csv", row.names = 1,check.names = FALSE)), taxa_are_rows = TRUE),tax_table(as.matrix(read.csv(file = "USEARCH/Rarefied_Data/GOC_Geller_COI_taxa_rare_032119.csv", row.names = 1))),sample_data(read.csv(file = "Combined_PCTD_Metadata_043019.csv", row.names = 1)))

# Print a summary of the assembled object
GOC_COI_r
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1596 taxa and 20 samples ]
## sample_data() Sample Data:       [ 20 samples by 54 sample variables ]
## tax_table()   Taxonomy Table:    [ 1596 taxa by 7 taxonomic ranks ]
#Data previously Rarefied
#Data previously Rarefied
# Import the rarefied 18S tables. Note the metadata file here is the
# "noGOC2b" version (sample GOC2b dropped), giving 19 samples instead of 20.
GOC_18S_r <- merge_phyloseq(otu_table(as.matrix(read.csv(file = "USEARCH/Rarefied_Data/GOC_Geller_18S_otu_rare_032119.csv", row.names = 1,check.names = FALSE)), taxa_are_rows = TRUE),tax_table(as.matrix(read.csv(file = "USEARCH/Rarefied_Data/GOC_Geller_18S_taxa_rare_032119.csv", row.names = 1))),sample_data(read.csv(file = "Combined_PCTD_Metadata_043019_noGOC2b.csv", row.names = 1)))

# Print a summary of the assembled object
GOC_18S_r
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 341 taxa and 19 samples ]
## sample_data() Sample Data:       [ 19 samples by 54 sample variables ]
## tax_table()   Taxonomy Table:    [ 341 taxa by 7 taxonomic ranks ]
# Extract the OTU table from a phyloseq object as a plain numeric matrix with
# samples as rows and taxa as columns — the orientation vegan functions expect.
vegan_otu <- function(physeq) {
  otu <- otu_table(physeq)
  if (taxa_are_rows(otu)) {
    otu <- t(otu)
  }
  as(otu, "matrix")
}


# Save rarefied, merged data as plain objects for downstream vegan/dplyr work.
# (Assignment standardized to `<-` per the tidyverse style guide.)
vegan_matrix_18S <- vegan_otu(GOC_18S_r)
otu_tab_18S <- otu_table(GOC_18S_r)
tax_tab_18S <- tax_table(GOC_18S_r)
samp_dat_18S <- sample_data(GOC_18S_r)

vegan_matrix_COI <- vegan_otu(GOC_COI_r)
otu_tab_COI <- otu_table(GOC_COI_r)
tax_tab_COI <- tax_table(GOC_COI_r)
samp_dat_COI <- sample_data(GOC_COI_r)

5 Transform sample metadata

#Transform Numeric Data in metadata
# Coerce environmental columns to numeric under new, consistently named columns.
# NOTE(review): if any of these columns were read in as factors, as.numeric()
# returns the factor level codes, not the measured values — TODO confirm the
# source CSV columns are character/numeric.
sample_data(GOC_COI_r) <- transform(sample_data(GOC_COI_r), TMP_C = as.numeric(TMP),
                                NO3_um = as.numeric(NO3),
                                CHL_GFF = as.numeric(CHL_GFF),
                                oxy_ml = as.numeric(OXY_ML),
                                sigma_theta = as.numeric(SIG_T),
                                DEPTH_M = as.numeric(DEPTH),
                                Fluor = as.numeric(FLUOR),
                                SAL = as.numeric(SAL)
)
#check modes of data
sapply(sample_data(GOC_COI_r), mode)
##              order       tag_sequence  primer_sequence_F 
##          "numeric"          "numeric"          "numeric" 
##  primer_sequence_R  library_tag_combo            library 
##          "numeric"          "numeric"          "numeric" 
##        sample_type              locus         tag_number 
##          "numeric"          "numeric"          "numeric" 
##                 R1                 R2          Treatment 
##          "numeric"          "numeric"          "numeric" 
##        Time_of_Day        Description      Description_3 
##          "numeric"          "numeric"          "numeric" 
##               site                SEQ             BOTTLE 
##          "numeric"          "numeric"          "numeric" 
##              DEPTH             CRUISE           PLATFORM 
##          "numeric"          "numeric"          "numeric" 
##            DEC_LAT           DEC_LONG                TMP 
##          "numeric"          "numeric"          "numeric" 
##                SAL            CHL_GFF           PRESSURE 
##          "numeric"          "numeric"          "numeric" 
##                NO3             OXY_ML               RDEP 
##          "numeric"          "numeric"          "numeric" 
##          TRANSMISS              SIG_T              FLUOR 
##          "numeric"          "numeric"          "numeric" 
##          DATE_TIME             cruise          SEQAvg_dg 
##          "numeric"          "numeric"          "numeric" 
##           AvgOfTMP         StDevOfTMP         CountOfTMP 
##          "numeric"          "numeric"          "numeric" 
##          AvgOfSAL1         StDevOfSAL         CountOfSAL 
##          "numeric"          "numeric"          "numeric" 
##          AvgOfCHLA        StDevOfCHLA        CountOfCHLA 
##          "numeric"          "numeric"          "numeric" 
##       AvgOfOXY_ML1     CountOfOXY_ML1      CountOfOXY_ML 
##          "numeric"          "numeric"          "numeric" 
##     AvgOfTRANSMISS   StDevOfTRANSMISS   CountOfTRANSMISS 
##          "numeric"          "numeric"          "numeric" 
##   AvgOfSIGMA_THETA StDevOfSIGMA_THETA CountOfSIGMA_THETA 
##          "numeric"          "numeric"          "numeric" 
##              TMP_C             NO3_um             oxy_ml 
##          "numeric"          "numeric"          "numeric" 
##        sigma_theta            DEPTH_M              Fluor 
##          "numeric"          "numeric"          "numeric"
# Keep a copy of the (transformed) COI sample metadata
samp_dat <- sample_data(GOC_COI_r)
#Transform Numeric Data in metadata
# Same numeric coercion as the COI block, applied to the 18S object.
# NOTE(review): as.numeric() on factor columns would yield level codes — TODO confirm.
sample_data(GOC_18S_r) <- transform(sample_data(GOC_18S_r), TMP_C = as.numeric(TMP),
                                NO3_um = as.numeric(NO3),
                                CHL_GFF = as.numeric(CHL_GFF),
                                oxy_ml = as.numeric(OXY_ML),
                                sigma_theta = as.numeric(SIG_T),
                                DEPTH_M = as.numeric(DEPTH),
                                Fluor = as.numeric(FLUOR),
                                SAL = as.numeric(SAL)
)
#check modes of data
sapply(sample_data(GOC_18S_r), mode)
##              order       tag_sequence  primer_sequence_F 
##          "numeric"          "numeric"          "numeric" 
##  primer_sequence_R  library_tag_combo            library 
##          "numeric"          "numeric"          "numeric" 
##        sample_type              locus         tag_number 
##          "numeric"          "numeric"          "numeric" 
##                 R1                 R2          Treatment 
##          "numeric"          "numeric"          "numeric" 
##        Time_of_Day        Description      Description_3 
##          "numeric"          "numeric"          "numeric" 
##               site                SEQ             BOTTLE 
##          "numeric"          "numeric"          "numeric" 
##              DEPTH             CRUISE           PLATFORM 
##          "numeric"          "numeric"          "numeric" 
##            DEC_LAT           DEC_LONG                TMP 
##          "numeric"          "numeric"          "numeric" 
##                SAL            CHL_GFF           PRESSURE 
##          "numeric"          "numeric"          "numeric" 
##                NO3             OXY_ML               RDEP 
##          "numeric"          "numeric"          "numeric" 
##          TRANSMISS              SIG_T              FLUOR 
##          "numeric"          "numeric"          "numeric" 
##          DATE_TIME             cruise          SEQAvg_dg 
##          "numeric"          "numeric"          "numeric" 
##           AvgOfTMP         StDevOfTMP         CountOfTMP 
##          "numeric"          "numeric"          "numeric" 
##          AvgOfSAL1         StDevOfSAL         CountOfSAL 
##          "numeric"          "numeric"          "numeric" 
##          AvgOfCHLA        StDevOfCHLA        CountOfCHLA 
##          "numeric"          "numeric"          "numeric" 
##       AvgOfOXY_ML1     CountOfOXY_ML1      CountOfOXY_ML 
##          "numeric"          "numeric"          "numeric" 
##     AvgOfTRANSMISS   StDevOfTRANSMISS   CountOfTRANSMISS 
##          "numeric"          "numeric"          "numeric" 
##   AvgOfSIGMA_THETA StDevOfSIGMA_THETA CountOfSIGMA_THETA 
##          "numeric"          "numeric"          "numeric" 
##              TMP_C             NO3_um             oxy_ml 
##          "numeric"          "numeric"          "numeric" 
##        sigma_theta            DEPTH_M              Fluor 
##          "numeric"          "numeric"          "numeric"
samp_dat <- sample_data(GOC_18S_r)

6 Compare Diversity

Simpson, Shannon Diversity, Chao1 Richness

#Phyloseq
#COI  (comment corrected: this chunk plots GOC_COI_r, not 18S)
# Alpha diversity (Shannon, Simpson, Chao1) per sample, colored by time of day,
# with sampling regions ordered north-to-south on the x axis.
p_oBiom <- plot_richness(GOC_COI_r,x="Description", measures=c("Shannon","Simpson", "Chao1"), nrow=1, color='Time_of_Day') +
  theme(text = element_text(size=16), strip.text.x = element_text(size = 16), axis.text = element_text(size = rel(1), colour = "black")) +
  xlab("Sampling Region") + 
  ggtitle("COI Rarefied")+theme_bw()+scale_x_discrete(limits=c('PCNorth','EUNorth','EUSouth'))

# Overlay per-region boxplots on the richness points
p_oBiom + geom_boxplot(data=p_oBiom$data, aes(x=Description, y=value, color=NULL), alpha=0.1)
## Warning: Removed 40 rows containing missing values (geom_errorbar).

#18S  (comment corrected: this chunk plots GOC_18S_r, not COI)
# Same alpha-diversity panel as above, for the rarefied 18S data.
p_oBiom <- plot_richness(GOC_18S_r,x="Description", measures=c("Shannon","Simpson", "Chao1"), nrow=1,color='Time_of_Day') +
  theme(text = element_text(size=16), strip.text.x = element_text(size = 16), axis.text = element_text(size = rel(1), colour = "black")) +
  xlab("Sampling Region") + 
  ggtitle("18S Rarefied")+theme_bw()+scale_x_discrete(limits=c('PCNorth','EUNorth','EUSouth'))

# Overlay per-region boxplots on the richness points
p_oBiom + geom_boxplot(data=p_oBiom$data, aes(x=Description, y=value, color=NULL), alpha=0.1)
## Warning: Removed 38 rows containing missing values (geom_errorbar).

Create a dataframe of Shannon diversity data. The following section draws from the tutorial: http://www.sthda.com/english/wiki/one-way-anova-test-in-r

6.1 COI

#Shannon
# Per-sample Shannon diversity from the COI sample-by-taxa matrix.
H <- diversity(vegan_matrix_COI)
# Turn the named numeric vector into a one-row tibble, then transpose it so
# sample names end up in column `var` and diversity values in column `1`.
# NOTE(review): tbl_df(), gather() and spread() are deprecated/superseded
# (as_tibble(), pivot_longer(), pivot_wider()); kept as-is to match the
# rendered output in this report.
H <- bind_rows(H)
H <- tbl_df(H)

H %<>%
   rownames_to_column %>%
   gather(var, value, -rowname) %>% 
   spread(rowname, value)

#Need to group by description
samp_df <- tbl_df(samp_dat_COI)
## Warning in class(x) <- c(subclass, tibble_class): Setting class(x) to
## multiple strings ("tbl_df", "tbl", ...); result will no longer be an S4
## object
# Carry the sample names as an explicit column so we can join on them
samp_df$sample_names <- rownames(samp_dat_COI)
#join by index
joined_df <- inner_join(H,samp_df, by=c('var'='sample_names'))

# Order regions north-to-south for plotting and modelling
joined_df$Description <- ordered(joined_df$Description,
                         levels = c("PCNorth", "EUNorth", "EUSouth"))
levels(joined_df$Description)
## [1] "PCNorth" "EUNorth" "EUSouth"
# Rename the spread-created column "1" to something meaningful
# (assignment standardized to `<-`)
joined_df <- rename(joined_df, 'Diversity'='1')

joined_df$Diversity <- as.numeric(joined_df$Diversity)


# Boxplot of Shannon diversity by region
ggboxplot(joined_df, x = "Description", y = "Diversity", 
          color = "Description", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          order = c("PCNorth", "EUNorth", "EUSouth"),
          ylab = "Diversity", xlab = "Description")

# Mean +/- SE with jittered points, by region
ggline(joined_df, x = "Description", y = "Diversity", 
       add = c("mean_se", "jitter"), 
       order = c("PCNorth", "EUNorth", "EUSouth"),
       ylab = "Diversity", xlab = "Description")

# Mean +/- SE by time of day, colored by region
ggline(joined_df, x = "Time_of_Day", y = "Diversity", 
       add = c("mean_se", "jitter"), 
       order = c("day", "night", "transition"),
       color='Description',
       ylab = "Diversity", xlab = "Time_of_Day")
## Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
## Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced

## Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
## Warning: Removed 3 rows containing missing values (geom_errorbar).

ANOVA

# One-way ANOVA: Shannon diversity ~ sampling region (COI)
print('Region')
## [1] "Region"
# Compute the analysis of variance
res.aov <- aov(Diversity ~ Description, data = joined_df)
# Summary of the analysis
summary(res.aov)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## Description  2  5.829  2.9146   10.52 0.00106 **
## Residuals   17  4.711  0.2771                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Diversity ~ Description, data = joined_df)
## 
## $Description
##                       diff        lwr       upr     p adj
## EUNorth-PCNorth 0.07540415 -0.7786545 0.9294628 0.9721624
## EUSouth-PCNorth 1.11613455  0.3764980 1.8557711 0.0033292
## EUSouth-EUNorth 1.04073040  0.3010939 1.7803669 0.0058091
#Another method
#library(multcomp)
#perform multiple comparison procedures for an ANOVA. glht stands for general linear hypothesis tests
# Should agree with TukeyHSD() above up to the adjustment method
summary(glht(res.aov, linfct = mcp(Description = "Tukey")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = Diversity ~ Description, data = joined_df)
## 
## Linear Hypotheses:
##                        Estimate Std. Error t value Pr(>|t|)   
## EUNorth - PCNorth == 0   0.0754     0.3329   0.226  0.97199   
## EUSouth - PCNorth == 0   1.1161     0.2883   3.871  0.00322 **
## EUSouth - EUNorth == 0   1.0407     0.2883   3.610  0.00596 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
# One-way ANOVA: Shannon diversity ~ time of day (COI)
print('Time of Day')
## [1] "Time of Day"
# Compute the analysis of variance
res.aov <- aov(Diversity ~ Time_of_Day, data = joined_df)
# Summary of the analysis
summary(res.aov)
##             Df Sum Sq Mean Sq F value Pr(>F)
## Time_of_Day  2  1.463  0.7313    1.37  0.281
## Residuals   17  9.077  0.5340
#TukeyHSD(res.aov)  # skipped: the Time_of_Day ANOVA above was not significant

#Another method
#library(multcomp)
#perform multiple comparison procedures for an ANOVA. glht stands for general linear hypothesis tests
summary(glht(res.aov, linfct = mcp(Time_of_Day = "Tukey")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = Diversity ~ Time_of_Day, data = joined_df)
## 
## Linear Hypotheses:
##                         Estimate Std. Error t value Pr(>|t|)
## night - day == 0          0.3883     0.4323   0.898    0.647
## transition - day == 0    -0.2183     0.4717  -0.463    0.888
## transition - night == 0  -0.6066     0.3773  -1.608    0.268
## (Adjusted p values reported -- single-step method)

If ANOVA result is significant, compute Tukey HSD (Tukey Honest Significant Differences, R function: TukeyHSD()) for performing multiple pairwise-comparison between the means of groups.

The function TukeyHSD() takes the fitted ANOVA as an argument. diff: difference between means of the two groups. lwr, upr: the lower and the upper end point of the confidence interval at 95% (default). p adj: p-value after adjustment for the multiple comparisons.

Homogeneity of variances

res.aov <- aov(Diversity ~ Description, data = joined_df)
#Homogeneity of variances
# Residuals-vs-fitted plot to eyeball variance homogeneity across groups
plot(res.aov, 1)

# Formal Levene's test (car package); p > 0.05 supports homogeneity
leveneTest(Diversity ~ Description, data = joined_df)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  0.5453 0.5895
##       17

From the output above we can see that the p-value is not less than the significance level of 0.05. This means that there is no evidence to suggest that the variance across groups is statistically significantly different. Therefore, we can assume the homogeneity of variances in the different treatment groups. http://www.sthda.com/english/wiki/one-way-anova-test-in-r

Homogeneity of variances

res.aov <- aov(Diversity ~ Time_of_Day, data = joined_df)
#Homogeneity of variances
# Residuals-vs-fitted diagnostic plot
plot(res.aov, 1)

# Levene's test for the Time_of_Day grouping
leveneTest(Diversity ~ Time_of_Day, data = joined_df)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.5705 0.2367
##       17

Look at correlation between environmental variables and diversity

#look at Temperature vs Diversity

#average by site (TMP is constant within a site, so grouping on both keeps it)
# (assignment standardized to `<-`; result remains grouped by site)
avgdf <- joined_df %>% group_by(site, TMP) %>% summarize(Diversity = mean(Diversity))
avgdf
## # A tibble: 15 x 3
## # Groups:   site [15]
##    site     TMP Diversity
##    <fct>  <dbl>     <dbl>
##  1 CP23    20.9      3.42
##  2 GOC2.0  24.1      3.55
##  3 UC1     12.6      2.14
##  4 UC10    17.2      4.04
##  5 UC12    18.9      3.05
##  6 UC13    19.7      4.12
##  7 UC14    20.3      4.24
##  8 UC15    20.6      3.50
##  9 UC2     12.7      2.49
## 10 UC3     12.7      2.65
## 11 UC4     14.7      2.22
## 12 UC5     15.3      3.60
## 13 UC6     15.7      2.01
## 14 UC7     15.5      3.12
## 15 UC9     16.3      2.00
# Pearson correlation between per-site mean diversity and temperature
res <- cor.test(avgdf$Diversity, avgdf$TMP, 
                    method = "pearson")
res
## 
##  Pearson's product-moment correlation
## 
## data:  avgdf$Diversity and avgdf$TMP
## t = 3.0555, df = 13, p-value = 0.009203
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2007232 0.8704835
## sample estimates:
##     cor 
## 0.64651
# Scatter of diversity vs temperature with regression line and Pearson r
ggscatter(avgdf, x = "TMP", y = "Diversity", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "Temperature", ylab = "Shannon Diversity", label="site", font.label=(8))

# NOTE(review): filename says "Bray" but this figure is Shannon diversity vs
# temperature — confirm the intended file name.
ggsave('Figures/Bray_Temp_Corr_USEARCH_COI.png')
## Saving 7 x 5 in image

Compare diversity means for 18S

Create a dataframe of Shannon diversity data. The following section draws from the tutorial: http://www.sthda.com/english/wiki/one-way-anova-test-in-r

6.2 18S

#Shannon
# Per-sample Shannon diversity from the 18S sample-by-taxa matrix, then the
# same transpose-and-join dance as the COI section above.
H <- diversity(vegan_matrix_18S)
H <- bind_rows(H)
H <- tbl_df(H)

H %<>%
   rownames_to_column %>%
   gather(var, value, -rowname) %>% 
   spread(rowname, value)

#Need to group by description
samp_df <- tbl_df(samp_dat_18S)
## Warning in class(x) <- c(subclass, tibble_class): Setting class(x) to
## multiple strings ("tbl_df", "tbl", ...); result will no longer be an S4
## object
samp_df$sample_names <- rownames(samp_dat_18S)
#join by index
joined_df <- inner_join(H,samp_df, by=c('var'='sample_names'))

# Order regions north-to-south
joined_df$Description <- ordered(joined_df$Description,
                         levels = c("PCNorth", "EUNorth", "EUSouth"))
levels(joined_df$Description)
## [1] "PCNorth" "EUNorth" "EUSouth"
# Rename the spread-created column "1" (assignment standardized to `<-`)
joined_df <- rename(joined_df, 'Diversity'='1')

joined_df$Diversity <- as.numeric(joined_df$Diversity)


# Boxplot of 18S Shannon diversity by region
ggboxplot(joined_df, x = "Description", y = "Diversity", 
          color = "Description", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          order = c("PCNorth", "EUNorth", "EUSouth"),
          ylab = "Diversity", xlab = "Description")

# Mean +/- SE with jittered points, by region
ggline(joined_df, x = "Description", y = "Diversity", 
       add = c("mean_se", "jitter"), 
       order = c("PCNorth", "EUNorth", "EUSouth"),
       ylab = "Diversity", xlab = "Description")

# Mean +/- SE by time of day, colored by region
ggline(joined_df, x = "Time_of_Day", y = "Diversity", 
       add = c("mean_se", "jitter"), 
       order = c("day", "night", "transition"),
       color='Description',
       ylab = "Diversity", xlab = "Time_of_Day")
## Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
## Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced

## Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
## Warning: Removed 3 rows containing missing values (geom_errorbar).

ANOVA

# One-way ANOVA: Shannon diversity ~ sampling region (18S)
print('Region')
## [1] "Region"
# Compute the analysis of variance
res.aov <- aov(Diversity ~ Description, data = joined_df)
# Summary of the analysis
summary(res.aov)
##             Df Sum Sq Mean Sq F value Pr(>F)
## Description  2 0.1931 0.09653   0.843  0.449
## Residuals   16 1.8322 0.11451
#TukeyHSD(res.aov)  # skipped: the region ANOVA above was not significant

#Another method
#library(multcomp)
#perform multiple comparison procedures for an ANOVA. glht stands for general linear hypothesis tests
summary(glht(res.aov, linfct = mcp(Description = "Tukey")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = Diversity ~ Description, data = joined_df)
## 
## Linear Hypotheses:
##                        Estimate Std. Error t value Pr(>|t|)
## EUNorth - PCNorth == 0  0.15498    0.21402   0.724    0.752
## EUSouth - PCNorth == 0  0.24507    0.18875   1.298    0.415
## EUSouth - EUNorth == 0  0.09009    0.18875   0.477    0.882
## (Adjusted p values reported -- single-step method)
# One-way ANOVA: Shannon diversity ~ time of day (18S)
print('Time of Day')
## [1] "Time of Day"
# Compute the analysis of variance
res.aov <- aov(Diversity ~ Time_of_Day, data = joined_df)
# Summary of the analysis
summary(res.aov)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## Time_of_Day  2 0.9976  0.4988   7.766 0.00439 **
## Residuals   16 1.0276  0.0642                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Diversity ~ Time_of_Day, data = joined_df)
## 
## $Time_of_Day
##                         diff         lwr        upr     p adj
## night-day         0.47178542  0.07881899  0.8647519 0.0179327
## transition-day    0.02187715 -0.40023651  0.4439908 0.9901947
## transition-night -0.44990827 -0.79456263 -0.1052539 0.0103247
#Another method
#library(multcomp)
#perform multiple comparison procedures for an ANOVA. glht stands for general linear hypothesis tests
# Should agree with TukeyHSD() above up to the adjustment method
summary(glht(res.aov, linfct = mcp(Time_of_Day = "Tukey")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = Diversity ~ Time_of_Day, data = joined_df)
## 
## Linear Hypotheses:
##                         Estimate Std. Error t value Pr(>|t|)  
## night - day == 0         0.47179    0.15229   3.098   0.0178 *
## transition - day == 0    0.02188    0.16359   0.134   0.9901  
## transition - night == 0 -0.44991    0.13357  -3.368   0.0103 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

If ANOVA result is significant, compute Tukey HSD (Tukey Honest Significant Differences, R function: TukeyHSD()) for performing multiple pairwise-comparison between the means of groups.

The function TukeyHSD() takes the fitted ANOVA as an argument. diff: difference between means of the two groups. lwr, upr: the lower and the upper end point of the confidence interval at 95% (default). p adj: p-value after adjustment for the multiple comparisons.

Homogeneity of variances

res.aov <- aov(Diversity ~ Description, data = joined_df)
#Homogeneity of variances
# Residuals-vs-fitted diagnostic plot
plot(res.aov, 1)

# Levene's test; p > 0.05 supports homogeneity of variances
leveneTest(Diversity ~ Description, data = joined_df)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.2528 0.3123
##       16

From the output above we can see that the p-value is not less than the significance level of 0.05. This means that there is no evidence to suggest that the variance across groups is statistically significantly different. Therefore, we can assume the homogeneity of variances in the different treatment groups. http://www.sthda.com/english/wiki/one-way-anova-test-in-r

Homogeneity of variances

res.aov <- aov(Diversity ~ Time_of_Day, data = joined_df)
#Homogeneity of variances
# Residuals-vs-fitted diagnostic plot
plot(res.aov, 1)

# Levene's test for Time_of_Day (significant here — see note below)
leveneTest(Diversity ~ Time_of_Day, data = joined_df)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value   Pr(>F)   
## group  2  6.5898 0.008172 **
##       16                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variance between groups is statistically different, violates assumption of homogeneity of variances.

Look at correlation between environmental variables and diversity

#look at Temperature vs Diversity

#average by site (TMP is constant within a site, so grouping on both keeps it)
# (assignment standardized to `<-`; result remains grouped by site)
avgdf <- joined_df %>% group_by(site, TMP) %>% summarize(Diversity = mean(Diversity))
avgdf
## # A tibble: 15 x 3
## # Groups:   site [15]
##    site     TMP Diversity
##    <fct>  <dbl>     <dbl>
##  1 CP23    20.9      2.89
##  2 GOC2.0  24.1      2.44
##  3 UC1     12.6      2.28
##  4 UC10    17.2      2.11
##  5 UC12    18.9      2.14
##  6 UC13    19.7      2.29
##  7 UC14    20.3      2.37
##  8 UC15    20.6      2.29
##  9 UC2     12.7      2.09
## 10 UC3     12.7      2.27
## 11 UC4     14.7      2.05
## 12 UC5     15.3      2.94
## 13 UC6     15.7      2.02
## 14 UC7     15.5      2.85
## 15 UC9     16.3      2.09
# Pearson correlation between per-site mean 18S diversity and temperature
res <- cor.test(avgdf$Diversity, avgdf$TMP, 
                    method = "pearson")
res
## 
##  Pearson's product-moment correlation
## 
## data:  avgdf$Diversity and avgdf$TMP
## t = 0.80725, df = 13, p-value = 0.434
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3308041  0.6571921
## sample estimates:
##       cor 
## 0.2184825
# Scatter of diversity vs temperature with regression line and Pearson r
ggscatter(avgdf, x = "TMP", y = "Diversity", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "Temperature", ylab = "Shannon Diversity", label="site", font.label=(8))

# NOTE(review): filename says "Bray" but this figure is Shannon diversity vs
# temperature — confirm the intended file name.
ggsave('Figures/Bray_Temp_Corr_USEARCH_18S.png')
## Saving 7 x 5 in image

7 Dendrogram Plots

SIMPROF Analysis 18S

#create hierarchical cluster diagram - example
# Bray-Curtis dissimilarities between 18S samples, average-linkage tree
dist.mat <- vegdist(vegan_matrix_18S, method="bray", binary=FALSE)
clust.res <- hclust(dist.mat, method="average")
plot(clust.res)

#SIMPROF: gives clusters that pass a significance threshold
########################################
# complete-linkage clustering on Bray-Curtis distances; samples are rows
# (assignment standardized to `<-`)
simprof_mat_18S <- simprof(vegan_matrix_18S, num.expected=100, num.simulated=99,
                      method.cluster="complete", method.distance = "braycurtis",
                      #method.transform = "squareroot",
                      alpha = 0.05, sample.orientation = "row")
## Warning: This version of the Bray-Curtis index does not use
## standardization.
## Warning: To use the standardized version, use "actual-braycurtis".
## Warning: See the help documentation for more information.
#visualize with colored hierarchical cluster diagram by sample name
# NOTE(review): plot_sim is assigned but not referenced later in this view
plot_sim <- simprof.plot(simprof_mat_18S)

# Print the full simprof result (numgroups, pval, hclust, significantclusters)
simprof_mat_18S
## $numgroups
## [1] 8
## 
## $pval
##       [,1] [,2]       [,3]
##  [1,]   -1   -2 1.00000000
##  [2,]  -12  -13 1.00000000
##  [3,]  -14    2 0.00000000
##  [4,]   -7   -8         NA
##  [5,]   -6  -11 1.00000000
##  [6,]   -3   -9 1.00000000
##  [7,]  -15  -17         NA
##  [8,]  -16  -18 1.00000000
##  [9,]  -10    4         NA
## [10,]    3    5 0.00000000
## [11,]  -19    7 0.68686869
## [12,]   10   11 0.00000000
## [13,]   -4   -5         NA
## [14,]    1    6 0.00000000
## [15,]    9   13 0.38383838
## [16,]    8   12 0.01010101
## [17,]   14   15 0.01010101
## [18,]   16   17 0.00000000
## 
## $hclust
## 
## Call:
## hclust(d = rawdata.dist, method = method.cluster)
## 
## Cluster method   : complete 
## Number of objects: 19 
## 
## 
## $significantclusters
## $significantclusters[[1]]
## [1] "UC5" "UC7"
## 
## $significantclusters[[2]]
## [1] "UC3_2"
## 
## $significantclusters[[3]]
## [1] "UC3_1" "UC3"  
## 
## $significantclusters[[4]]
## [1] "UC1" "UC2"
## 
## $significantclusters[[5]]
## [1] "UC9" "UC4" "UC6"
## 
## $significantclusters[[6]]
## [1] "CP23_1" "CP23"  
## 
## $significantclusters[[7]]
## [1] "CP23_2" "UC14"  
## 
## $significantclusters[[8]]
## [1] "UC15"  "UC12"  "UC13"  "GOC2a" "UC10"
#simprof has 4 objects (list of lists)
#how to call a significant cluster:
#simprof_mat[[4]][[1]]

#Map these results to an nmds plot
simprofCLUSTERS_18S <- data.frame()

# Only keep clusters containing more than one sample (singletons dropped).
# (Comment corrected: the condition is `> 1`, i.e. more than ONE sample.)
# NOTE(review): growing a data.frame with rbind in a loop is O(n^2) but fine
# for this handful of clusters.
for(j in seq_along(simprof_mat_18S$significantclusters)){
  if(length(simprof_mat_18S$significantclusters[[j]]) > 1){
    simprofCLUSTERS_18S <- rbind(simprofCLUSTERS_18S, cbind(j, simprof_mat_18S$significantclusters[[j]]))
  }
}

#create df with samples as index and clusterID as "simprofCLUSTER"
rownames(simprofCLUSTERS_18S) <- simprofCLUSTERS_18S[,2]
colnames(simprofCLUSTERS_18S) <- c("simprofCLUSTER", "group")

simprofCLUSTERS_18S
##        simprofCLUSTER  group
## UC5                 1    UC5
## UC7                 1    UC7
## UC3_1               3  UC3_1
## UC3                 3    UC3
## UC1                 4    UC1
## UC2                 4    UC2
## UC9                 5    UC9
## UC4                 5    UC4
## UC6                 5    UC6
## CP23_1              6 CP23_1
## CP23                6   CP23
## CP23_2              7 CP23_2
## UC14                7   UC14
## UC15                8   UC15
## UC12                8   UC12
## UC13                8   UC13
## GOC2a               8  GOC2a
## UC10                8   UC10
#Plot with ggplot2

library(ggplot2)
library(ggdendro)

# Extract plottable segments/labels from the simprof hclust ([[3]] is $hclust)
dendr <- dendro_data(simprof_mat_18S[[3]], type="rectangle") 

#Define cluster based on similarity percent - good for plotting
# Cut the tree at several heights to get nested cluster memberships.
# NOTE(review): h values of 20/40/60/80 assume the tree height is on a
# 0-100 scale — confirm against the dendrogram axis.
clust <- cutree(simprof_mat_18S[[3]], h = 40)               # find 'cut' clusters (k) or choose level (h) numeric scalar or vector with heights where the tree should be cut.
clust2 <- cutree(simprof_mat_18S[[3]], h = 60)
clust3 <- cutree(simprof_mat_18S[[3]], h = 20)  
clust4 <- cutree(simprof_mat_18S[[3]], h = 80)  

# One row per sample with its cluster id at each cut height
clust.df <- data.frame(label = names(clust), cluster_40 = clust, cluster_60 = clust2, cluster_20 = clust3, cluster_80 = clust4)
sapply(clust.df, mode)
##      label cluster_40 cluster_60 cluster_20 cluster_80 
##  "numeric"  "numeric"  "numeric"  "numeric"  "numeric"
clust.df$label <- as.character(clust.df$label)
clust.df <- clust.df[order(clust.df$label),]

#join sample data with cluster df and simprofCLUSTERS df:
# by=0 merges on rownames; samp_dat here holds the 18S sample metadata
tree_scores <- merge(clust.df, samp_dat,by=0)
rownames(tree_scores) <- tree_scores$Row.names
#remove duplicated column after assigning index
tree_scores$Row.names <- NULL
#merge with simprof cluster df
tree_scores <- merge(tree_scores, simprofCLUSTERS_18S,by=0, all=TRUE )


#dendr$labels is df of x,y and attributes -> add to it with metadata (tree_scores)
tree_data_18S <- merge(dendr$labels, tree_scores,by="label")
# Row.names (from the second merge above) carries the sample name
tree_data_18S$Site <- tree_data_18S$Row.names

# Write the 18S dendrogram to SVG
svglite::svglite("Figures/Dendro_18S_USEARCH.svg", height=4.5, width=4.5)

# Colorblind-friendly palette, one color per Description group
cbPalette <- c("#56B4E9", "#E69F00", "#009E73")

# Dendrogram: segments from ggdendro; leaf points colored by Description and
# shaped by Time_of_Day; dashed lines mark the 20/40/60 cut heights
ggplot() + 
  geom_segment(data=segment(dendr), aes(x=x, y=y, xend=xend, yend=yend)) +
  geom_point(data=tree_data_18S, aes(x=x, y=y, color=Description, shape=Time_of_Day),alpha=1, size=5)+ 
  geom_text(data=tree_data_18S, aes(x=x, y=y, label=Site, hjust=-.4), size=3) +
  #geom_text(data=tree_data_18S, aes(x=x, y=y, label=simprofCLUSTER, hjust=3), size=5) +
  geom_hline(yintercept=20, color="blue", linetype = "longdash", alpha=0.8) +
  geom_hline(yintercept=40, color="green", linetype = "longdash", alpha=0.8) +
  geom_hline(yintercept=60, color="orange", linetype = "longdash", alpha=0.8) +
  coord_flip() + scale_y_reverse(expand=c(2, 0)) + 
  scale_colour_manual(values=cbPalette) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank(),
        axis.text.y=element_blank(),
        axis.title.y=element_blank(),
        panel.background=element_rect(fill="white"),
        panel.grid=element_blank())

# Close the SVG device
dev.off()
## quartz_off_screen 
##                 2
# Repeat the same dendrogram plot so it renders inline in the knitted document
ggplot() + 
  geom_segment(data=segment(dendr), aes(x=x, y=y, xend=xend, yend=yend)) +
  geom_point(data=tree_data_18S, aes(x=x, y=y, color=Description, shape=Time_of_Day),alpha=1, size=5)+ 
  geom_text(data=tree_data_18S, aes(x=x, y=y, label=Site, hjust=-.4), size=3) +
  #geom_text(data=tree_data_18S, aes(x=x, y=y, label=simprofCLUSTER, hjust=3), size=5) +
  geom_hline(yintercept=20, color="blue", linetype = "longdash", alpha=0.8) +
  geom_hline(yintercept=40, color="green", linetype = "longdash", alpha=0.8) +
  geom_hline(yintercept=60, color="orange", linetype = "longdash", alpha=0.8) +
  coord_flip() + scale_y_reverse(expand=c(2, 0)) + 
  scale_colour_manual(values=cbPalette) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank(),
        axis.text.y=element_blank(),
        axis.title.y=element_blank(),
        panel.background=element_rect(fill="white"),
        panel.grid=element_blank())

SIMPROF Analysis COI

#create hierarchical cluster diagram - example
# Bray-Curtis dissimilarity on the COI community matrix, then average-linkage
# hierarchical clustering, for a quick visual check.
dist.mat <- vegdist(vegan_matrix_COI, method="bray", binary=FALSE)
clust.res <- hclust(dist.mat, method="average")
plot(clust.res)

#SIMPROF: gives clusters that pass a significance threshold
########################################
# NOTE: "braycurtis" here is the non-standardized version (see the warnings
# emitted below); "actual-braycurtis" would give the standardized index.
simprof_mat_COI <- simprof(vegan_matrix_COI, num.expected=100, num.simulated=99,
                           method.cluster="complete", method.distance = "braycurtis",
                           #method.transform = "squareroot",
                           alpha = 0.05, sample.orientation = "row")
## Warning: This version of the Bray-Curtis index does not use
## standardization.
## Warning: To use the standardized version, use "actual-braycurtis".
## Warning: See the help documentation for more information.
#visualize with colored hierarchical cluster diagram by sample name
plot_sim <- simprof.plot(simprof_mat_COI)

# Inspect the full SIMPROF result (numgroups, pval table, hclust, clusters)
simprof_mat_COI
## $numgroups
## [1] 5
## 
## $pval
##       [,1] [,2]      [,3]
##  [1,]  -13  -15        NA
##  [2,]   -1   -3 1.0000000
##  [3,]   -4   -5 1.0000000
##  [4,]  -14  -18        NA
##  [5,]  -11  -12        NA
##  [6,]  -19    1        NA
##  [7,]  -20    4        NA
##  [8,]    5    7        NA
##  [9,]   -8   -9        NA
## [10,]  -16    6        NA
## [11,]   -7  -10        NA
## [12,]    9   11        NA
## [13,]   -2    2 0.0000000
## [14,]    8   10        NA
## [15,]   -6   12 0.3737374
## [16,]  -17   14 0.2020202
## [17,]    3   15 0.0000000
## [18,]   13   17 0.0000000
## [19,]   16   18 0.0000000
## 
## $hclust
## 
## Call:
## hclust(d = rawdata.dist, method = method.cluster)
## 
## Cluster method   : complete 
## Number of objects: 20 
## 
## 
## $significantclusters
## $significantclusters[[1]]
##  [1] "UC5"   "UC1"   "UC2"   "UC9"   "UC3_2" "UC6"   "UC4"   "UC7"  
##  [9] "UC3_1" "UC3"  
## 
## $significantclusters[[2]]
## [1] "CP23_2"
## 
## $significantclusters[[3]]
## [1] "CP23_1" "CP23"  
## 
## $significantclusters[[4]]
## [1] "GOC2a" "GOC2b"
## 
## $significantclusters[[5]]
## [1] "UC10" "UC13" "UC14" "UC12" "UC15"
#simprof has 4 objects (list of lists)
#how to call a significant cluster:
#simprof_mat[[4]][[1]]

#Map these results to an nmds plot
# Collect each significant SIMPROF cluster's members into a two-column table:
# cluster index (j) and sample name.
simprofCLUSTERS_COI <- data.frame()

# Only keep clusters with more than one sample (singletons are skipped);
# seq_along() is safe even if there are zero significant clusters.
for (j in seq_along(simprof_mat_COI$significantclusters)) {
  if (length(simprof_mat_COI$significantclusters[[j]]) > 1) {
    simprofCLUSTERS_COI <- rbind(simprofCLUSTERS_COI,
                                 cbind(j, simprof_mat_COI$significantclusters[[j]]))
  }
}

#create df with samples as index and clusterID as "simprofCLUSTER"
# Column 2 holds the sample names; use them as row names so this table can
# later be merged with the sample metadata by row name.
rownames(simprofCLUSTERS_COI) <- simprofCLUSTERS_COI[,2]
colnames(simprofCLUSTERS_COI) <- c("simprofCLUSTER", "group")

# Print the cluster membership table (sample -> significant SIMPROF cluster)
simprofCLUSTERS_COI
##        simprofCLUSTER  group
## UC5                 1    UC5
## UC1                 1    UC1
## UC2                 1    UC2
## UC9                 1    UC9
## UC3_2               1  UC3_2
## UC6                 1    UC6
## UC4                 1    UC4
## UC7                 1    UC7
## UC3_1               1  UC3_1
## UC3                 1    UC3
## CP23_1              3 CP23_1
## CP23                3   CP23
## GOC2a               4  GOC2a
## GOC2b               4  GOC2b
## UC10                5   UC10
## UC13                5   UC13
## UC14                5   UC14
## UC12                5   UC12
## UC15                5   UC15

Plot COI SIMPROF dendrogram with ggplot2

#Plot with ggplot2
library(ggplot2)
library(ggdendro)

# Convert the SIMPROF hclust object (3rd element of the result list) into a
# ggplot-friendly dendrogram data structure
dendr <- dendro_data(simprof_mat_COI[[3]], type="rectangle") 

#Define cluster based on similarity percent - good for plotting
clust <- cutree(simprof_mat_COI[[3]], h = 40)               # find 'cut' clusters (k) or choose level (h) numeric scalar or vector with heights where the tree should be cut.
clust2 <- cutree(simprof_mat_COI[[3]], h = 60)
clust3 <- cutree(simprof_mat_COI[[3]], h = 20)  
clust4 <- cutree(simprof_mat_COI[[3]], h = 80)  

# One row per sample; columns give cluster membership at each cut height
clust.df <- data.frame(label = names(clust), cluster_40 = clust, cluster_60 = clust2, cluster_20 = clust3, cluster_80 = clust4)
sapply(clust.df, mode)
##      label cluster_40 cluster_60 cluster_20 cluster_80 
##  "numeric"  "numeric"  "numeric"  "numeric"  "numeric"
# Coerce labels to character and order rows by sample label
clust.df$label <- as.character(clust.df$label)
clust.df <- clust.df[order(clust.df$label),]

#join sample data with cluster df and simprofCLUSTERS df:
# merge by row names (by=0) against the COI sample metadata table
tree_scores <- merge(clust.df, samp_dat_COI,by=0)
rownames(tree_scores) <- tree_scores$Row.names
#remove duplicated column after assigning index
tree_scores$Row.names <- NULL
#merge with simprof cluster df
tree_scores <- merge(tree_scores, simprofCLUSTERS_COI,by=0, all=TRUE )


#dendr$labels is df of x,y and attributes -> add to it with metadata (tree_scores)
tree_data_COI <- merge(dendr$labels, tree_scores,by="label")
# Row.names holds the sample IDs; keep them as a "Site" column for plot labels
tree_data_COI$Site <- tree_data_COI$Row.names

# Write the COI dendrogram to SVG
svglite::svglite("Figures/Dendro_COI_USEARCH.svg", height=4.5, width=4.5)

# Colorblind-friendly palette, one color per Description group
cbPalette <- c("#56B4E9", "#E69F00", "#009E73")

# Dendrogram: segments from ggdendro; leaf points colored by Description and
# shaped by Time_of_Day; dashed lines mark the 20/40/60 cut heights
ggplot() + 
  geom_segment(data=segment(dendr), aes(x=x, y=y, xend=xend, yend=yend)) +
  geom_point(data=tree_data_COI, aes(x=x, y=y, color=Description, shape=Time_of_Day),alpha=1, size=5)+ 
  geom_text(data=tree_data_COI, aes(x=x, y=y, label=Site, hjust=-.4), size=3) +
  #geom_text(data=tree_data_COI, aes(x=x, y=y, label=simprofCLUSTER, hjust=3), size=5) +
  geom_hline(yintercept=20, color="blue", linetype = "longdash", alpha=0.8) +
  geom_hline(yintercept=40, color="green", linetype = "longdash", alpha=0.8) +
  geom_hline(yintercept=60, color="orange", linetype = "longdash", alpha=0.8) +
  coord_flip() + scale_y_reverse(expand=c(2, 0)) + 
  scale_colour_manual(values=cbPalette) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank(),
        axis.text.y=element_blank(),
        axis.title.y=element_blank(),
        panel.background=element_rect(fill="white"),
        panel.grid=element_blank())

# Close the SVG device
dev.off()
## quartz_off_screen 
##                 2
# Repeat the same dendrogram plot so it renders inline in the knitted document
ggplot() + 
  geom_segment(data=segment(dendr), aes(x=x, y=y, xend=xend, yend=yend)) +
  geom_point(data=tree_data_COI, aes(x=x, y=y, color=Description, shape=Time_of_Day),alpha=1, size=5)+ 
  geom_text(data=tree_data_COI, aes(x=x, y=y, label=Site, hjust=-.4), size=3) +
  #geom_text(data=tree_data_COI, aes(x=x, y=y, label=simprofCLUSTER, hjust=3), size=5) +
  geom_hline(yintercept=20, color="blue", linetype = "longdash", alpha=0.8) +
  geom_hline(yintercept=40, color="green", linetype = "longdash", alpha=0.8) +
  geom_hline(yintercept=60, color="orange", linetype = "longdash", alpha=0.8) +
  coord_flip() + scale_y_reverse(expand=c(2, 0)) + 
  scale_colour_manual(values=cbPalette) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank(),
        axis.text.y=element_blank(),
        axis.title.y=element_blank(),
        panel.background=element_rect(fill="white"),
        panel.grid=element_blank())

8 NMDS Analysis

Nonmetric Multidimensional Scaling

8.1 COI

#make NMDS (non-metric multidimensional scaling) ordination of the COI matrix
set.seed(12) #set seed if want reproducible plots, otherwise random
NMDS_analysis <- metaMDS(vegan_matrix_COI, # Our community-by-species matrix
                         k=2) # The number of reduced dimensions
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.07604477 
## Run 1 stress 0.07173004 
## ... New best solution
## ... Procrustes: rmse 0.09850278  max resid 0.2304768 
## Run 2 stress 0.09297389 
## Run 3 stress 0.08926806 
## Run 4 stress 0.0724757 
## Run 5 stress 0.07237347 
## Run 6 stress 0.07266014 
## Run 7 stress 0.3647148 
## Run 8 stress 0.07044475 
## ... New best solution
## ... Procrustes: rmse 0.01937406  max resid 0.07070818 
## Run 9 stress 0.07267198 
## Run 10 stress 0.0724757 
## Run 11 stress 0.0726599 
## Run 12 stress 0.08096942 
## Run 13 stress 0.07267192 
## Run 14 stress 0.08926808 
## Run 15 stress 0.08120321 
## Run 16 stress 0.07044475 
## ... New best solution
## ... Procrustes: rmse 1.16031e-05  max resid 2.239007e-05 
## ... Similar to previous best
## Run 17 stress 0.0724757 
## Run 18 stress 0.07267192 
## Run 19 stress 0.08926845 
## Run 20 stress 0.08926851 
## *** Solution reached
# Shepard diagram: ordination distances vs. observed dissimilarities
stressplot(NMDS_analysis)

species.scores <- as.data.frame(scores(NMDS_analysis, "species"))  #Using the scores function from vegan to extract the species scores and convert to a data.frame
species.scores$species <- rownames(species.scores)  # create a column of species, represents OTU IDs
nmds_sp_scores <- merge(species.scores, tax_tab_COI,by="row.names", all=TRUE)
data.scores <- as.data.frame(scores(NMDS_analysis))  #Using the scores function from vegan to extract the site scores and convert to a data.frame
# Index tree_data_COI by sample label so the site scores merge by row name
rownames(tree_data_COI) <- tree_data_COI$label
nmds_scores <- merge(data.scores, tree_data_COI,by="row.names", all=TRUE)

Rename NMDS data to save for later.

nmds_COI <- nmds_scores

Plot

# NMDS scatter of COI samples, colored by Description, shaped by Treatment
ggplot() + 
  geom_point(data=nmds_COI,aes(x=NMDS1,y=NMDS2, colour=Description, shape=Treatment),size=7,alpha=0.9) + # add the point markers
  geom_text(data=nmds_COI,aes(x=NMDS1,y=NMDS2,label=Treatment),size=3,vjust=0.4) +  # add the site labels
  scale_colour_brewer(palette = "Dark2")+
  coord_equal() +
  theme_bw()

# Same plot with t-distribution ellipses (dashed outline + shaded polygon)
# drawn per Description group
ggplot() + 
  stat_ellipse(data=nmds_COI,aes(x=NMDS1,y=NMDS2, colour=Description),type = "t", linetype=2) +
  stat_ellipse(data=nmds_COI,aes(x=NMDS1,y=NMDS2, fill=Description),geom= "polygon", alpha=0.2) +
  geom_point(data=nmds_COI,aes(x=NMDS1,y=NMDS2, colour=Description, shape=Treatment),size=7,alpha=0.9) + # add the point markers
  geom_text(data=nmds_COI,aes(x=NMDS1,y=NMDS2,label=Treatment),size=3,vjust=0.4) +  # add the site labels
  scale_colour_brewer(palette = "Dark2")+
  scale_fill_brewer(palette = "Dark2")+
  coord_equal() +
  theme_bw()

8.2 18S

#make NMDS (non-metric multidimensional scaling) ordination of the 18S matrix
set.seed(2) #set seed if want reproducible plots, otherwise random
NMDS_analysis <- metaMDS(vegan_matrix_18S, # Our community-by-species matrix
                         k=2) # The number of reduced dimensions
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.08363982 
## Run 1 stress 0.08514747 
## Run 2 stress 0.08514748 
## Run 3 stress 0.08363982 
## ... Procrustes: rmse 3.75827e-06  max resid 1.02521e-05 
## ... Similar to previous best
## Run 4 stress 0.1183835 
## Run 5 stress 0.1224329 
## Run 6 stress 0.08363982 
## ... New best solution
## ... Procrustes: rmse 4.181976e-06  max resid 1.10221e-05 
## ... Similar to previous best
## Run 7 stress 0.3252761 
## Run 8 stress 0.08363982 
## ... Procrustes: rmse 2.933452e-06  max resid 5.588491e-06 
## ... Similar to previous best
## Run 9 stress 0.08384065 
## ... Procrustes: rmse 0.005494003  max resid 0.01937741 
## Run 10 stress 0.1166501 
## Run 11 stress 0.09489886 
## Run 12 stress 0.11665 
## Run 13 stress 0.09218911 
## Run 14 stress 0.08363982 
## ... Procrustes: rmse 3.153746e-06  max resid 6.631076e-06 
## ... Similar to previous best
## Run 15 stress 0.0851475 
## Run 16 stress 0.1183836 
## Run 17 stress 0.08514748 
## Run 18 stress 0.08363982 
## ... Procrustes: rmse 1.326604e-06  max resid 3.080319e-06 
## ... Similar to previous best
## Run 19 stress 0.1166505 
## Run 20 stress 0.1166498 
## *** Solution reached
NMDS_analysis
## 
## Call:
## metaMDS(comm = vegan_matrix_18S, k = 2) 
## 
## global Multidimensional Scaling using monoMDS
## 
## Data:     wisconsin(sqrt(vegan_matrix_18S)) 
## Distance: bray 
## 
## Dimensions: 2 
## Stress:     0.08363982 
## Stress type 1, weak ties
## Two convergent solutions found after 20 tries
## Scaling: centring, PC rotation, halfchange scaling 
## Species: expanded scores based on 'wisconsin(sqrt(vegan_matrix_18S))'
# Shepard diagram: ordination distances vs. observed dissimilarities
stressplot(NMDS_analysis)

species.scores <- as.data.frame(scores(NMDS_analysis, "species"))  #Using the scores function from vegan to extract the species scores and convert to a data.frame
species.scores$species <- rownames(species.scores)  # create a column of species, represents OTU IDs
nmds_sp_scores <- merge(species.scores, tax_tab_18S,by="row.names", all=TRUE)
data.scores <- as.data.frame(scores(NMDS_analysis))  #Using the scores function from vegan to extract the site scores and convert to a data.frame
# Index tree_data_18S by sample label so the site scores merge by row name
rownames(tree_data_18S) <- tree_data_18S$label
nmds_scores <- merge(data.scores, tree_data_18S,by="row.names", all=TRUE)
#nmds_sp_scores

Save the 18S NMDS data for later use in the Procrustes analysis.

nmds_18S <- nmds_scores

Plot

# NMDS scatter of 18S samples, colored by Description, shaped by Treatment
ggplot() + 
  geom_point(data=nmds_18S,aes(x=NMDS1,y=NMDS2, colour=Description, shape=Treatment),size=7,alpha=0.9) + # add the point markers
  geom_text(data=nmds_18S,aes(x=NMDS1,y=NMDS2,label=Treatment),size=3,vjust=0.4) +  # add the site labels
  scale_colour_brewer(palette = "Dark2")+
  coord_equal() +
  theme_bw()

# Same plot with t-distribution ellipses (dashed outline + shaded polygon)
# drawn per Description group
ggplot() + 
  stat_ellipse(data=nmds_18S,aes(x=NMDS1,y=NMDS2, colour=Description),type = "t", linetype=2) +
  stat_ellipse(data=nmds_18S,aes(x=NMDS1,y=NMDS2, fill=Description),geom= "polygon", alpha=0.2) +
  geom_point(data=nmds_18S,aes(x=NMDS1,y=NMDS2, colour=Description, shape=Treatment),size=7,alpha=0.9) + # add the point markers
  geom_text(data=nmds_18S,aes(x=NMDS1,y=NMDS2,label=Treatment),size=3,vjust=0.4) +  # add the site labels
  scale_colour_brewer(palette = "Dark2")+
  scale_fill_brewer(palette = "Dark2")+
  coord_equal() +
  theme_bw()

9 Procrustes Analysis

Plot 18S and COI

# Side-by-side NMDS plots of 18S and COI before Procrustes rotation.
# FIX: the sample-label column created in the dendrogram sections is "Site"
# (capital S); lowercase "site" is not defined and would error in aes().
ggplot() + 
  geom_point(data=nmds_18S,aes(x=NMDS1,y=NMDS2, colour=Description, shape=Treatment),size=7,alpha=0.9) + # add the point markers
  geom_text(data=nmds_18S,aes(x=NMDS1,y=NMDS2,label=Site),size=3,vjust=0.4) +  # add the site labels
  scale_colour_brewer(palette = "Dark2")+
  coord_equal() +
  theme_bw()+ labs(title = "18S")

ggplot() + 
  geom_point(data=nmds_COI,aes(x=NMDS1,y=NMDS2, colour=Description, shape=Treatment),size=7,alpha=0.9) + # add the point markers
  geom_text(data=nmds_COI,aes(x=NMDS1,y=NMDS2,label=Site),size=3,vjust=0.4) +  # add the site labels
  scale_colour_brewer(palette = "Dark2")+
  coord_equal() +
  theme_bw()+ labs(title = "COI")

#Procrustes Analysis

# 18S NMDS site scores are the fixed target configuration (X)
X <- subset(nmds_18S, select=c("NMDS1", "NMDS2"))
row.names(X) <- nmds_18S$Row.names
# COI NMDS site scores are the configuration to be rotated/scaled (Y)
Y <- subset(nmds_COI, select=c("NMDS1", "NMDS2"))
row.names(Y) <- nmds_COI$Row.names
#For Geller data, remove GOC2b for 18S
# (GOC2b appears only in the COI data — see the SIMPROF tables above — so it
# is dropped from the COI scores to make the two sample sets match)
Y <- Y[!rownames(Y) %in% c('GOC2b'), ]
# Non-symmetric Procrustes: rotate/scale Y onto the fixed target X
test <- procrustes(X , Y, scale = TRUE, symmetric = FALSE, scores = "sites")

# kind=1: ordination overlay with residual segments (captured for replotting)
test1 <- plot(test, kind=1)

# kind=2: per-sample residual plot
plot(test, kind=2)

summary(test)
## 
## Call:
## procrustes(X = X, Y = Y, scale = TRUE, symmetric = FALSE, scores = "sites") 
## 
## Number of objects: 19    Number of dimensions: 2 
## 
## Procrustes sum of squares:  
##  2.176319 
## Procrustes root mean squared error: 
##  0.3384422 
## Quantiles of Procrustes errors:
##        Min         1Q     Median         3Q        Max 
## 0.02703864 0.09749537 0.15417920 0.23960738 1.20115822 
## 
## Rotation matrix:
##            [,1]        [,2]
## [1,] 0.99990494 -0.01378832
## [2,] 0.01378832  0.99990494
## 
## Translation of averages:
##             [,1]        [,2]
## [1,] -0.03467968 0.005100198
## 
## Scaling of target:
## [1] 0.3495326
residuals(test)
##       CP23     CP23_1     CP23_2      GOC2a        UC1       UC10 
## 0.13842811 0.15456397 0.08889107 0.10609967 0.07705115 0.29009821 
##       UC12       UC13       UC14       UC15        UC2        UC3 
## 0.02703864 0.13524859 0.15417920 0.05204817 0.20569249 0.07813529 
##      UC3_1      UC3_2        UC4        UC5        UC6        UC7 
## 0.17327650 0.11717135 0.27352226 0.38555584 0.18897818 0.42939522 
##        UC9 
## 1.20115822

Plot with colors

# Extract target (18S, "heads") and rotated (COI, "points") coordinates
heads.scores <- as.data.frame(scores(test1$heads))
heads.scores$label <- rownames(heads.scores)
# NOTE(review): positional assignment — assumes heads.scores rows are in the
# same order as nmds_18S rows; verify before trusting the colors
heads.scores$Description <- nmds_18S[,c("Description")]

points.scores <- as.data.frame(scores(test1$points))
points.scores$label <- rownames(points.scores)
points.scores$Description <- nmds_18S[,c("Description")]

cbPalette <- c("#56B4E9", "#E69F00", "#009E73")
# Circles = 18S positions, triangles = rotated COI positions; a segment joins
# each sample's pair of points (longer segment = larger residual)
ggplot() + 
  geom_point(data=heads.scores,aes(x=NMDS1,y=NMDS2, colour=Description),shape=19,size=4,alpha=0.9) +
  geom_point(data=points.scores,aes(x=Dim1,y=Dim2, colour=Description),shape=17,size=4,alpha=0.9) +  # add the site labels
  geom_text(data=heads.scores,aes(x=NMDS1,y=NMDS2,label=label),size=3,vjust=1.8) +  # add the site labels
  geom_segment(aes(xend = heads.scores$NMDS1,yend = heads.scores$NMDS2,x = points.scores$Dim1,y = points.scores$Dim2),colour='black',alpha=0.6, size=0.2) +
  scale_colour_manual(values=cbPalette) +
  coord_equal() +
  theme_bw() + labs(title = "GOC clusters: ")

Plot with arrows

# Rebuild the score tables (same as the previous chunk)
heads.scores <- as.data.frame(scores(test1$heads))
heads.scores$label <- rownames(heads.scores)
# NOTE(review): positional assignment — assumes heads.scores rows are in the
# same order as nmds_18S rows; verify before trusting the colors
heads.scores$Description <- nmds_18S[,c("Description")]

points.scores <- as.data.frame(scores(test1$points))
points.scores$label <- rownames(points.scores)
points.scores$Description <- nmds_18S[,c("Description")]

# Plot with colors
# Write the colored Procrustes overlay to SVG
svglite::svglite("Figures/Procrustes_COI_18S_USEARCH.svg", height=6, width=6)

cbPalette <- c("#56B4E9", "#E69F00", "#009E73")
ggplot() + 
  geom_point(data=heads.scores,aes(x=NMDS1,y=NMDS2, colour=Description),shape=19,size=4,alpha=0.9) +
  geom_point(data=points.scores,aes(x=Dim1,y=Dim2, colour=Description),shape=17,size=4,alpha=0.9) +  # add the site labels
  geom_text(data=heads.scores,aes(x=NMDS1,y=NMDS2,label=label),size=3,vjust=2) +  # add the site labels
  geom_segment(aes(xend = heads.scores$NMDS1,yend = heads.scores$NMDS2,x = points.scores$Dim1,y = points.scores$Dim2),colour='black',alpha=0.6, size=0.2) +
  scale_colour_manual(values=cbPalette) +
  coord_equal() +
  theme_bw() + labs(title = "GOC clusters: ")
dev.off()
## quartz_off_screen 
##                 2
# Plot with Arrows
# Black = 18S positions, red = rotated COI positions; arrows point from the
# COI position toward the 18S position for each sample
ggplot() + 
  geom_point(data=heads.scores,aes(x=NMDS1,y=NMDS2),size=3,alpha=0.9) +
  geom_point(data=points.scores,aes(x=Dim1,y=Dim2),colour='red',size=3) +  # add the site labels
  geom_text(data=heads.scores,aes(x=NMDS1,y=NMDS2,label=label),size=3,vjust=1.8) +  # add the site labels
  geom_segment(aes(xend = heads.scores$NMDS1,yend = heads.scores$NMDS2,x = points.scores$Dim1,y = points.scores$Dim2),colour='black',arrow=arrow(length=unit(0.30,"cm")), size=0.5) +
  coord_equal() +
  theme_bw() + labs(title = "GOC clusters: ")

10 PERMANOVA

10.1 18S

# ADONIS (PERMANOVA) test: does 18S community composition differ by region
# (Description) and/or Time_of_Day?

# Calculate bray curtis distance matrix
B_bray <- phyloseq::distance(GOC_18S_r, method = "bray")

# make a data frame from the sample_data
sampledf <- data.frame(sample_data(GOC_18S_r))

# ADONIS calc; terms are added sequentially (first to last)
adonis_18S <- adonis(B_bray ~ Description + Time_of_Day, data = sampledf)
adonis_18S
## 
## Call:
## adonis(formula = B_bray ~ Description + Time_of_Day, data = sampledf) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##             Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)    
## Description  2   1.47188 0.73594  8.3513 0.4839  0.001 ***
## Time_of_Day  2   0.33609 0.16805  1.9070 0.1105  0.073 .  
## Residuals   14   1.23372 0.08812         0.4056           
## Total       18   3.04169                 1.0000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Homogeneity of dispersion test
# (checks whether the adonis result could be driven by unequal group
# dispersion rather than differences in group location)
model<-betadisper(B_bray,sampledf$Time_of_Day)
permutest(model, pairwise = TRUE)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)
## Groups     2 0.07528 0.037639 1.7562    999  0.233
## Residuals 16 0.34292 0.021433                     
## 
## Pairwise comparisons:
## (Observed p-value below diagonal, permuted p-value above diagonal)
##                day   night transition
## day                0.21400      0.607
## night      0.20634              0.137
## transition 0.59622 0.11583
# Ordination plot of group dispersions
plot(model)

# Distance-to-centroid boxplots per group
boxplot(model)

# Repeat the dispersion test grouping by Description
model<-betadisper(B_bray,sampledf$Description)
permutest(model, pairwise = TRUE)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df   Sum Sq  Mean Sq      F N.Perm Pr(>F)  
## Groups     2 0.082468 0.041234 5.9747    999  0.015 *
## Residuals 16 0.110423 0.006901                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Pairwise comparisons:
## (Observed p-value below diagonal, permuted p-value above diagonal)
##           EUNorth   EUSouth PCNorth
## EUNorth           0.2580000   0.115
## EUSouth 0.2646879             0.007
## PCNorth 0.1018828 0.0040554
# Ordination plot of group dispersions
plot(model)

# Distance-to-centroid boxplots per group
boxplot(model)

10.2 COI

# ADONIS (PERMANOVA) test: does COI community composition differ by region
# (Description) and/or Time_of_Day?

# Calculate bray curtis distance matrix
B_bray <- phyloseq::distance(GOC_COI_r, method = "bray")

# make a data frame from the sample_data
sampledf <- data.frame(sample_data(GOC_COI_r))

# ADONIS calc; terms are added sequentially (first to last)
adonis_COI <- adonis(B_bray ~ Description + Time_of_Day, data = sampledf)
adonis_COI
## 
## Call:
## adonis(formula = B_bray ~ Description + Time_of_Day, data = sampledf) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##             Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Description  2    2.7076 1.35378  6.3401 0.42398  0.001 ***
## Time_of_Day  2    0.4756 0.23780  1.1137 0.07447  0.305    
## Residuals   15    3.2029 0.21353         0.50154           
## Total       19    6.3860                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Homogeneity of dispersion test
# (checks whether the adonis result could be driven by unequal group
# dispersion rather than differences in group location)
model<-betadisper(B_bray,sampledf$Time_of_Day)
permutest(model, pairwise = TRUE)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)   
## Groups     2 0.16822 0.084108 5.1586    999  0.007 **
## Residuals 17 0.27717 0.016304                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Pairwise comparisons:
## (Observed p-value below diagonal, permuted p-value above diagonal)
##                  day     night transition
## day                  0.0610000      0.288
## night      0.0631369                0.014
## transition 0.2531851 0.0098196
# Ordination plot of group dispersions
plot(model)

# Distance-to-centroid boxplots per group
boxplot(model)

# Repeat the dispersion test grouping by Description
model<-betadisper(B_bray,sampledf$Description)
permutest(model, pairwise = TRUE)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)    
## Groups     2 0.20869 0.104344 14.486    999  0.001 ***
## Residuals 17 0.12245 0.007203                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Pairwise comparisons:
## (Observed p-value below diagonal, permuted p-value above diagonal)
##            EUNorth    EUSouth PCNorth
## EUNorth            1.5000e-02   0.076
## EUSouth 1.5759e-02              0.001
## PCNorth 8.8444e-02 7.0967e-05
# Ordination plot of group dispersions
plot(model)

# Distance-to-centroid boxplots per group
boxplot(model)

11 Package citations

citation("ggplot2")
## 
## To cite ggplot2 in publications, please use:
## 
##   H. Wickham. ggplot2: Elegant Graphics for Data Analysis.
##   Springer-Verlag New York, 2016.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     author = {Hadley Wickham},
##     title = {ggplot2: Elegant Graphics for Data Analysis},
##     publisher = {Springer-Verlag New York},
##     year = {2016},
##     isbn = {978-3-319-24277-4},
##     url = {https://ggplot2.tidyverse.org},
##   }
citation("clustsig")
## 
## To cite package 'clustsig' in publications use:
## 
##   Douglas Whitaker and Mary Christman (2014). clustsig:
##   Significant Cluster Analysis. R package version 1.1.
##   https://CRAN.R-project.org/package=clustsig
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {clustsig: Significant Cluster Analysis},
##     author = {Douglas Whitaker and Mary Christman},
##     year = {2014},
##     note = {R package version 1.1},
##     url = {https://CRAN.R-project.org/package=clustsig},
##   }
## 
## ATTENTION: This citation information has been auto-generated from
## the package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
citation("phyloseq")
## 
## To cite phyloseq in publications, or otherwise credit, please use:
## 
##   phyloseq: An R package for reproducible interactive analysis and
##   graphics of microbiome census data. Paul J. McMurdie and Susan
##   Holmes (2013) PLoS ONE 8(4):e61217.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     author = {Paul J. McMurdie and Susan Holmes},
##     journal = {PLoS ONE},
##     pages = {e61217},
##     title = {phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data},
##     volume = {8},
##     number = {4},
##     year = {2013},
##     url = {http://dx.plos.org/10.1371/journal.pone.0061217},
##   }
citation("vegan")
## 
## To cite package 'vegan' in publications use:
## 
##   Jari Oksanen, F. Guillaume Blanchet, Michael Friendly, Roeland
##   Kindt, Pierre Legendre, Dan McGlinn, Peter R. Minchin, R. B.
##   O'Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. Stevens,
##   Eduard Szoecs and Helene Wagner (2019). vegan: Community Ecology
##   Package. R package version 2.5-5.
##   https://CRAN.R-project.org/package=vegan
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {vegan: Community Ecology Package},
##     author = {Jari Oksanen and F. Guillaume Blanchet and Michael Friendly and Roeland Kindt and Pierre Legendre and Dan McGlinn and Peter R. Minchin and R. B. O'Hara and Gavin L. Simpson and Peter Solymos and M. Henry H. Stevens and Eduard Szoecs and Helene Wagner},
##     year = {2019},
##     note = {R package version 2.5-5},
##     url = {https://CRAN.R-project.org/package=vegan},
##   }
## 
## ATTENTION: This citation information has been auto-generated from
## the package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
citation("dplyr")
## 
## To cite package 'dplyr' in publications use:
## 
##   Hadley Wickham, Romain François, Lionel Henry and Kirill Müller
##   (2019). dplyr: A Grammar of Data Manipulation. R package version
##   0.8.3. https://CRAN.R-project.org/package=dplyr
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {dplyr: A Grammar of Data Manipulation},
##     author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller},
##     year = {2019},
##     note = {R package version 0.8.3},
##     url = {https://CRAN.R-project.org/package=dplyr},
##   }
citation("biomformat")
## 
## To cite package 'biomformat' in publications use:
## 
##   Paul J. McMurdie and Joseph N Paulson (2019). biomformat: An
##   interface package for the BIOM file format.
##   https://github.com/joey711/biomformat/, http://biom-format.org/.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {biomformat: An interface package for the BIOM file format},
##     author = {Paul J. McMurdie and Joseph N Paulson},
##     year = {2019},
##     note = {https://github.com/joey711/biomformat/, http://biom-format.org/},
##   }
## 
## ATTENTION: This citation information has been auto-generated from
## the package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
citation("ggdendro")
## 
## To cite package 'ggdendro' in publications use:
## 
##   Andrie de Vries and Brian D. Ripley (2016). ggdendro: Create
##   Dendrograms and Tree Diagrams Using 'ggplot2'. R package version
##   0.1-20. https://CRAN.R-project.org/package=ggdendro
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {ggdendro: Create Dendrograms and Tree Diagrams Using 'ggplot2'},
##     author = {Andrie {de Vries} and Brian D. Ripley},
##     year = {2016},
##     note = {R package version 0.1-20},
##     url = {https://CRAN.R-project.org/package=ggdendro},
##   }
citation("ggpubr")
## 
## To cite package 'ggpubr' in publications use:
## 
##   Alboukadel Kassambara (2019). ggpubr: 'ggplot2' Based
##   Publication Ready Plots. R package version 0.2.1.
##   https://CRAN.R-project.org/package=ggpubr
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {ggpubr: 'ggplot2' Based Publication Ready Plots},
##     author = {Alboukadel Kassambara},
##     year = {2019},
##     note = {R package version 0.2.1},
##     url = {https://CRAN.R-project.org/package=ggpubr},
##   }
citation("car")
## 
## To cite the car package in publications use:
## 
##   John Fox and Sanford Weisberg (2019). An {R} Companion to
##   Applied Regression, Third Edition. Thousand Oaks CA: Sage. URL:
##   https://socialsciences.mcmaster.ca/jfox/Books/Companion/
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     title = {An {R} Companion to Applied Regression},
##     edition = {Third},
##     author = {John Fox and Sanford Weisberg},
##     year = {2019},
##     publisher = {Sage},
##     address = {Thousand Oaks {CA}},
##     url = {https://socialsciences.mcmaster.ca/jfox/Books/Companion/},
##   }
citation("multcomp")
## 
## Please cite the multcomp package by the following reference:
## 
##   Torsten Hothorn, Frank Bretz and Peter Westfall (2008).
##   Simultaneous Inference in General Parametric Models. Biometrical
##   Journal 50(3), 346--363.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Simultaneous Inference in General Parametric Models},
##     author = {Torsten Hothorn and Frank Bretz and Peter Westfall},
##     journal = {Biometrical Journal},
##     year = {2008},
##     volume = {50},
##     number = {3},
##     pages = {346--363},
##   }